431 Class 23

Thomas E. Love, Ph.D.

2024-11-19

Today’s Agenda

  • An example from NHANES 8/2021 - 8/2023
  • Dealing with Missing Data, Partitioning, etc.
  • Most of the things we’ve done previously, plus…
  • Automated Variable Selection in Multiple Regression
    • Stepwise regression (AIC-based backwards elimination)
    • Best subsets (based on Mallows’ \(C_p\))
  • Some rudimentary cross-validation approaches

Today’s Packages

library(nhanesA)
library(naniar)
library(janitor)
library(broom)
library(gt)
library(glue)
library(car)
library(GGally)
library(mice)
library(patchwork)
library(olsrr)       ## for best subsets
library(xfun) 
library(easystats)
library(tidyverse)

theme_set(theme_bw())

Today’s Data Source

NHANES August 2021 - August 2023 Data

In the next two slides, I’ll show the work I did to create a raw data set containing data on subjects from this administration of NHANES, using the nhanesA package, but I won’t execute that code.

Instead, I’ve provided an R data set of the raw data (nh_L_raw.Rds) which we’ll use shortly.

Outcome and Potential Predictors

Today, we will try to predict waist circumference (in cm) using some combination of these 12 candidate predictors:

  • body weight (in kg): expected to be primary driver
  • demographics: age, sex, race/ethnicity, marital status, educational attainment, and insurance status
  • systolic and diastolic blood pressure (1st reading), total cholesterol, self-rated oral health, sedentary minutes/day

Today’s Data

Source: NHANES August 2021 - August 2023 Data

## Helper: pull one NHANES table (untranslated numeric codes), convert it
## to a tibble, and store SEQN as character so every later join keys on a
## consistent identifier type.
nh_pull <- function(table_name) {
  nhanes(table_name, translated = FALSE) |>
    tibble() |>
    mutate(SEQN = as.character(SEQN))
}

demoL  <- nh_pull('DEMO_L')   ## demographics (age, sex, race/eth, educ, marital)
bpxoL  <- nh_pull('BPXO_L')   ## blood pressure readings (BPXOSY1, BPXODI1)
bmxL   <- nh_pull('BMX_L')    ## body measures (BMXWT, BMXWAIST)
tcholL <- nh_pull('TCHOL_L')  ## total cholesterol (LBXTC)
hscrpL <- nh_pull('HSCRP_L')  ## hs C-reactive protein
hiqL   <- nh_pull('HIQ_L')    ## health insurance items (HIQ011, HIQ032*)
ohqL   <- nh_pull('OHQ_L')    ## oral health (OHQ845)
paqL   <- nh_pull('PAQ_L')    ## physical activity (PAD680)

Aggregating the Data

## Build the raw analytic file by left-joining each component table onto
## the demographics table, keyed on the subject identifier SEQN.
## NOTE(review): hscrpL was downloaded on the previous slide but is never
## joined here, so hs-CRP does not appear in nh_L_raw -- confirm that this
## omission is intentional.
t12 <- left_join(demoL, bpxoL, by = "SEQN")
t123 <- left_join(t12, bmxL, by = "SEQN")
t1234 <- left_join(t123, tcholL, by = "SEQN")
t12345 <- left_join(t1234, hiqL, by = "SEQN")
t123456 <- left_join(t12345, ohqL, by = "SEQN")
nh_L_raw <- left_join(t123456, paqL, by = "SEQN")

## Cache the merged raw data so the download code need not be re-run.
write_rds(nh_L_raw, "c23/data/nh_L_raw.Rds")

Pruning the Rows

Let’s ingest the raw data from our Rds file.

Then we want to focus on adults ages 21-79 (Age is in the RIDAGEYR variable) who were both interviewed and MEC examined (RIDSTATR = 2)

nh_L_raw <- read_rds("c23/data/nh_L_raw.Rds")
dim(nh_L_raw)
[1] 11933    86
## Keep interviewed-and-MEC-examined subjects (RIDSTATR == 2) aged 21-79;
## between() is inclusive on both ends, matching >= 21 & <= 79.
nh_L <- nh_L_raw |>
  filter(RIDSTATR == 2,
         between(RIDAGEYR, 21, 79))
dim(nh_L)
[1] 5669   86

Pruning the Variables

… and replacing Refused or Don’t Know codes with NA.

## Keep only the variables needed today, then convert "Refused" / "Don't
## Know" survey codes to NA.
## NOTE(review): only HIQ032A gets the 77/99 -> NA treatment; HIQ032B and
## HIQ032D are only used via equality tests on the next slides, where the
## special codes never match -- confirm that leaving them uncleaned is OK.
nh_L <- nh_L |>
  select(SEQN, RIAGENDR, RIDAGEYR, RIDRETH3, DMDEDUC2, DMDMARTZ,
         BPXOSY1, BPXODI1, BMXWT, BMXWAIST, LBXTC, 
         HIQ011, HIQ032A, HIQ032B, HIQ032D, 
         OHQ845, PAD680, WTINT2YR, WTMEC2YR) |>
  replace_with_na(replace = 
         list(DMDEDUC2 = c(7, 9), DMDMARTZ = c(77, 99), HIQ011 = c(7, 9), 
              HIQ032A = c(77, 99), OHQ845 = c(7, 9), PAD680 = c(7777, 9999)))

Factor Cleaning (1/2)

## Turn numeric demographic codes into labeled factors. fct_recode keeps
## the original code ordering, so e.g. sex has levels M then F.
nh_L <- nh_L |> 
  mutate(sex = fct_recode(factor(RIAGENDR), "M" = "1", "F" = "2")) |>
  ## Codes 1 and 2 (Mexican American, Other Hispanic) collapse to Hispanic.
  mutate(race_eth = fct_recode(factor(RIDRETH3), 
            "Hispanic" = "1", "Hispanic" = "2", "NH_White" = "3", 
            "NH_Black" = "4", "NH_Asian" = "6", "Other" = "7")) |>
  ## Codes 1 and 2 (less than HS diploma) collapse to NonHSGrad.
  mutate(educ = fct_recode(factor(DMDEDUC2),
            "NonHSGrad" = "1", "NonHSGrad" = "2", "HSGrad" = "3",
            "SomeCollege" = "4", "CollegeGrad" = "5")) |>
  mutate(marital = fct_recode(factor(DMDMARTZ),
            "Married" = "1", "Formerly" = "2", "Never" = "3"))

Factor Cleaning (2/2)

## Derive a primary-insurance category. case_when() evaluates top-down,
## so the row order imposes a hierarchy among insured (HIQ011 == 1)
## subjects: Medicaid, then Medicare, then Commercial. Subjects matching
## no row (including NA insurance answers) become NA.
nh_L <- nh_L |> 
  mutate(insur = factor(case_when(
    HIQ011 == 2 ~ "Uninsured",
    HIQ011 == 1 & HIQ032D == 4 ~ "Medicaid",
    HIQ011 == 1 & HIQ032B == 2 ~ "Medicare",
    HIQ011 == 1 & HIQ032A == 1 ~ "Commercial"))) |>
  ## Make Medicare the reference (first) level for modeling.
  mutate(insur = fct_relevel(insur, "Medicare")) |>
  ## Self-rated oral health: Excellent / Very Good / Good / Fair / Poor.
  mutate(oral_h = fct_recode(factor(OHQ845),
            "E" = "1", "VG" = "2", "G" = "3", "F" = "4", "P" = "5"))

Variable Renaming

## Replace raw NHANES variable names with short analysis-friendly names
## (positions of the columns are unchanged by rename()).
nh_L <- nh_L |> 
  rename(age = RIDAGEYR, sbp1 = BPXOSY1, dbp1 = BPXODI1, 
         wt_kg = BMXWT, waist = BMXWAIST, tot_chol = LBXTC, 
         sedentary = PAD680)

Final Set: 16 Variables

## Final analytic tibble: identifier (SEQN), outcome (waist), the 12
## candidate predictors, and the two NHANES sampling weights.
nh_L <- nh_L |> 
  select(SEQN, sex, age, race_eth, educ, marital, insur, 
         sbp1, dbp1, wt_kg, waist, tot_chol, oral_h, 
         sedentary, WTINT2YR, WTMEC2YR)

dim(nh_L)
[1] 5669   16

Variable Descriptions (1/3)

Variable Description NA
SEQN Subject identifier code 0
sex M or F 0
age age in years (21-79) 0
race_eth race/ethnicity (5 categories) 0
educ educational attainment (4 categories) 3
marital marital status (3 categories) 4

Variable Descriptions (2/3)

Variable Description NA
insur primary insurance (4 categories) 557
sbp1 1st Systolic BP reading (mm Hg) 177
dbp1 1st Diastolic BP reading (mm Hg) 177
wt_kg Body weight (kg) 70
waist Waist circumference (cm) 259
tot_chol Total Cholesterol (mg/dl) 516

Variable Descriptions (3/3)

Variable Description NA
oral_h self-reported oral health (5 levels) 9
sedentary typical day: sedentary activity (minutes) 39
WTINT2YR sampling weight for interview items 0
WTMEC2YR sampling weight for examination items 0

data_codebook() results

data_codebook(nh_L |> select(-SEQN, -WTINT2YR, -WTMEC2YR))
select(nh_L, -SEQN, -WTINT2YR, -WTMEC2YR) (5669 rows and 13 variables, 13 shown)

ID | Name      | Type        |   Missings |        Values |            N
---+-----------+-------------+------------+---------------+-------------
1  | sex       | categorical |   0 (0.0%) |             M | 2531 (44.6%)
   |           |             |            |             F | 3138 (55.4%)
---+-----------+-------------+------------+---------------+-------------
2  | age       | numeric     |   0 (0.0%) |      [21, 79] |         5669
---+-----------+-------------+------------+---------------+-------------
3  | race_eth  | categorical |   0 (0.0%) |      Hispanic |  965 (17.0%)
   |           |             |            |      NH_White | 3298 (58.2%)
   |           |             |            |      NH_Black |  723 (12.8%)
   |           |             |            |      NH_Asian |  316 ( 5.6%)
   |           |             |            |         Other |  367 ( 6.5%)
---+-----------+-------------+------------+---------------+-------------
4  | educ      | categorical |   3 (0.1%) |     NonHSGrad |  688 (12.1%)
   |           |             |            |        HSGrad | 1192 (21.0%)
   |           |             |            |   SomeCollege | 1741 (30.7%)
   |           |             |            |   CollegeGrad | 2045 (36.1%)
---+-----------+-------------+------------+---------------+-------------
5  | marital   | categorical |   4 (0.1%) |       Married | 3112 (54.9%)
   |           |             |            |      Formerly | 1365 (24.1%)
   |           |             |            |         Never | 1188 (21.0%)
---+-----------+-------------+------------+---------------+-------------
6  | insur     | categorical | 557 (9.8%) |      Medicare | 1595 (31.2%)
   |           |             |            |    Commercial | 2217 (43.4%)
   |           |             |            |      Medicaid |  804 (15.7%)
   |           |             |            |     Uninsured |  496 ( 9.7%)
---+-----------+-------------+------------+---------------+-------------
7  | sbp1      | numeric     | 177 (3.1%) |     [75, 232] |         5492
---+-----------+-------------+------------+---------------+-------------
8  | dbp1      | numeric     | 177 (3.1%) |     [40, 142] |         5492
---+-----------+-------------+------------+---------------+-------------
9  | wt_kg     | numeric     |  70 (1.2%) | [27.9, 248.2] |         5599
---+-----------+-------------+------------+---------------+-------------
10 | waist     | numeric     | 259 (4.6%) |     [60, 187] |         5410
---+-----------+-------------+------------+---------------+-------------
11 | tot_chol  | numeric     | 516 (9.1%) |     [62, 438] |         5153
---+-----------+-------------+------------+---------------+-------------
12 | oral_h    | categorical |   9 (0.2%) |             E |  710 (12.5%)
   |           |             |            |            VG | 1391 (24.6%)
   |           |             |            |             G | 1872 (33.1%)
   |           |             |            |             F | 1011 (17.9%)
   |           |             |            |             P |  676 (11.9%)
---+-----------+-------------+------------+---------------+-------------
13 | sedentary | numeric     |  39 (0.7%) |     [0, 1380] |         5630
------------------------------------------------------------------------

Missing Data by Variable

miss_var_summary(nh_L) |> 
  slice(1:8)
# A tibble: 8 × 3
  variable  n_miss pct_miss
  <chr>      <int>    <num>
1 insur        557    9.83 
2 tot_chol     516    9.10 
3 waist        259    4.57 
4 sbp1         177    3.12 
5 dbp1         177    3.12 
6 wt_kg         70    1.23 
7 sedentary     39    0.688
8 oral_h         9    0.159
miss_var_summary(nh_L) |> 
  slice(9:16)
# A tibble: 8 × 3
  variable n_miss pct_miss
  <chr>     <int>    <num>
1 marital       4   0.0706
2 educ          3   0.0529
3 SEQN          0   0     
4 sex           0   0     
5 age           0   0     
6 race_eth      0   0     
7 WTINT2YR      0   0     
8 WTMEC2YR      0   0     

Missing Data

dim(nh_L)
[1] 5669   16
miss_case_table(nh_L)
# A tibble: 6 × 3
  n_miss_in_case n_cases pct_cases
           <int>   <int>     <dbl>
1              0    4357    76.9  
2              1    1005    17.7  
3              2     180     3.18 
4              3      75     1.32 
5              4      39     0.688
6              5      13     0.229
pct_miss_case(nh_L)
[1] 23.14341

Missingness Mechanism

Can we assume the data are MCAR, per Little’s test?

mcar_test(nh_L)
# A tibble: 1 × 4
  statistic    df p.value missing.patterns
      <dbl> <dbl>   <dbl>            <int>
1     1022.   532       0               41
  • Looks like we’ll need to assume MAR and do some imputation.

Outcome and Potential Predictors

Again, our goal is to try to predict waist circumference (in cm) using some combination of these 12 candidate predictors:

  • body weight (in kg): expected to be primary driver
  • demographics: age, sex, race/ethnicity, marital status, educational attainment, and insurance status
  • systolic and diastolic blood pressure (1st reading), total cholesterol, self-rated oral health, sedentary minutes/day

We’ll generate several possible models.

Single imputation, then partition

## One imputation (m = 1) via mice with its default settings; the seed
## makes the imputed values reproducible.
set.seed(202411191)
nh_i1 <- mice(nh_L, m = 1, printFlag = FALSE)
nh_si <- complete(nh_i1) ## si for single imputation
n_miss(nh_si)  ## confirm no missing values remain after imputation
[1] 0

We’ll use this single imputation for decision-making…

Partition the Data

c(nrow(nh_si), n_distinct(nh_si$SEQN)) # check that SEQNs are all distinct
[1] 5669 5669
## Partition: sample 4000 subjects (without replacement) for training;
## the remaining 1669 form the test set via anti_join on the unique SEQN.
set.seed(202411192)

nh_train <- nh_si |> slice_sample(n = 4000, replace = FALSE)
nh_test <- anti_join(nh_si, nh_train, by = "SEQN")

Waist Circumference (our outcome)

Transform our Outcome?

## Kitchen-sink model with all 12 candidate predictors, fit only to check
## whether the outcome (waist) should be transformed.
t_check <- lm(waist ~ wt_kg + age + sex + race_eth + marital + educ + 
                insur + sbp1 + dbp1 + tot_chol + oral_h + sedentary,
              data = nh_train)
boxCox(t_check)  ## Box-Cox plot: assess a power transformation of waist

Collinearity?

vif(t_check)
              GVIF Df GVIF^(1/(2*Df))
wt_kg     1.236757  1        1.112096
age       2.312126  1        1.520568
sex       1.140010  1        1.067712
race_eth  1.447115  4        1.047280
marital   1.350003  2        1.077913
educ      1.456866  3        1.064723
insur     2.744398  3        1.183245
sbp1      2.204324  1        1.484697
dbp1      2.003562  1        1.415472
tot_chol  1.103807  1        1.050622
oral_h    1.225628  4        1.025758
sedentary 1.118783  1        1.057725

Automated Predictor Selection

Stepwise Regression Process

We start with a big model, including all candidate predictors.

  • R looks at each variable in turn to see what would happen to AIC if we dropped it from the big model.
  • It then drops the variable that most reduces the AIC, creating the first step down.
  • Then it repeats the process with the newly stepped down model (where one variable has already been dropped) and drops the variable that most reduces the AIC now.
  • When it can no longer drop a variable and reduce the AIC, the process stops.

Stepwise Regression

We’ll start with our 12-predictor model, predicting our untransformed waist circumference.

## Full 12-predictor model: the starting point for backward elimination.
fit_all12 <- lm(waist ~ wt_kg + age + sex + race_eth + marital + educ + 
                  insur + sbp1 + dbp1 + tot_chol + oral_h + sedentary,
                data = nh_train)

Here’s one way to run a backwards stepwise elimination search of our regression model, using AIC as the measure to tell us when to stop removing variables.

step_res <- step(fit_all12, direction = "backward")

Results on the next slide.

Stepwise Regression

Start:  AIC=14453.44
waist ~ wt_kg + age + sex + race_eth + marital + educ + insur + 
    sbp1 + dbp1 + tot_chol + oral_h + sedentary

            Df Sum of Sq    RSS   AIC
- marital    2        20 146612 14450
- tot_chol   1        16 146608 14452
- sbp1       1        35 146627 14452
<none>                   146592 14453
- sedentary  1       489 147081 14465
- dbp1       1       639 147231 14469
- insur      3      1099 147691 14477
- oral_h     4      1262 147854 14480
- educ       3      3259 149851 14535
- race_eth   4      5295 151887 14587
- sex        1     11840 158433 14762
- age        1     18184 164776 14919
- wt_kg      1    851358 997950 22124

Step:  AIC=14449.99
waist ~ wt_kg + age + sex + race_eth + educ + insur + sbp1 + 
    dbp1 + tot_chol + oral_h + sedentary

            Df Sum of Sq    RSS   AIC
- tot_chol   1        18 146630 14448
- sbp1       1        36 146648 14449
<none>                   146612 14450
- sedentary  1       482 147095 14461
- dbp1       1       639 147252 14465
- insur      3      1105 147718 14474
- oral_h     4      1280 147892 14477
- educ       3      3298 149910 14533
- race_eth   4      5465 152077 14588
- sex        1     12143 158756 14766
- age        1     20837 167450 14980
- wt_kg      1    851827 998440 22122

Step:  AIC=14448.48
waist ~ wt_kg + age + sex + race_eth + educ + insur + sbp1 + 
    dbp1 + oral_h + sedentary

            Df Sum of Sq     RSS   AIC
- sbp1       1        39  146670 14448
<none>                    146630 14448
- sedentary  1       481  147112 14460
- dbp1       1       697  147328 14466
- insur      3      1089  147719 14472
- oral_h     4      1273  147904 14475
- educ       3      3283  149913 14531
- race_eth   4      5560  152191 14589
- sex        1     12325  158955 14769
- age        1     21210  167840 14987
- wt_kg      1    860219 1006849 22153

Step:  AIC=14447.55
waist ~ wt_kg + age + sex + race_eth + educ + insur + dbp1 + 
    oral_h + sedentary

            Df Sum of Sq     RSS   AIC
<none>                    146670 14448
- sedentary  1       487  147156 14459
- dbp1       1       891  147561 14470
- insur      3      1077  147746 14471
- oral_h     4      1277  147946 14474
- educ       3      3248  149917 14529
- race_eth   4      5658  152327 14591
- sex        1     12980  159650 14785
- age        1     22396  169066 15014
- wt_kg      1    880112 1026781 22230

So What Happened?

  • The big model (12 predictors) had an AIC of 14453.44
  • Dropping marital reduced the AIC to 14449.99
  • Then dropping tot_chol reduced the AIC to 14448.48
  • Then dropping sbp1 reduced the AIC to 14447.55
  • but at that point, no additional variable removals reduce the AIC. The best we can do is to drop sedentary but that increases AIC back to 14459.

Stepwise Regression Result

So we drop three of the 12 predictors (marital, tot_chol and sbp1) and keep the remaining nine (wt_kg, age, sex, race_eth, educ, insur, oral_h, dbp1, and sedentary).

## Nine-predictor model retained by backward elimination (marital,
## tot_chol and sbp1 dropped from fit_all12).
fit_step9 <- lm(waist ~ wt_kg + age + sex + race_eth + educ + 
                  insur + dbp1 + oral_h + sedentary, data = nh_train)

It turns out that stepwise regression is a pretty poor approach, even by the standards of automated variable selection procedures. Are there better options?

  • Can we generate other competitive candidate predictor sets?

A Best Subsets Approach

Again, we start with our big model fit_all12, with all 12 predictors, but now we use a function from the olsrr package to complete some automated variable selection1.

## Best-subsets search ranked by Mallows' Cp, restricted to models using
## at most 6 of the 12 candidate predictors.
nh_best_res <- 
  ols_step_best_subset(fit_all12, max_order = 6, metric = "cp")

This code identifies the best subsets of variables according to Mallows’ \(C_p\) statistic, while restricting the search to models with 1-6 variables (of our original 12) included. Results on the next slide…

We’ll compare four options

I’ve somewhat arbitrarily selected1 these four options.

## The four candidate models we will compare. fit_12 refits fit_all12 and
## fit_09 refits fit_step9 under names that encode their predictor counts;
## fit_04 and fit_01 come from the best-subsets results.
fit_12 <- lm(waist ~ wt_kg + age + sex + race_eth + marital + educ + 
                  insur + sbp1 + dbp1 + tot_chol + oral_h + sedentary,
                data = nh_train)

fit_09 <- lm(waist ~ wt_kg + age + sex + race_eth + educ + 
                  insur + dbp1 + oral_h + sedentary, data = nh_train)

fit_04 <- lm(waist ~ wt_kg + age + sex + educ, data = nh_train)

fit_01 <- lm(waist ~ wt_kg, data = nh_train)

Posterior Predictive Checks (fit_12)

set.seed(43101); check_model(fit_12, check = "pp_check")

Posterior Predictive Checks (fit_09)

set.seed(43102); check_model(fit_09, check = "pp_check")

Posterior Predictive Checks (fit_04)

set.seed(43103); check_model(fit_04, check = "pp_check")

Posterior Predictive Checks (fit_01)

set.seed(43104); check_model(fit_01, check = "pp_check")

Checking Model fit_04

set.seed(43105); check_model(fit_04)

Training Sample Performance

plot(compare_performance(fit_12, fit_09, fit_04, fit_01))

Training Sample Performance

compare_performance(fit_12, fit_09, fit_04, fit_01, rank = TRUE) |> 
  gt() |> fmt_number(decimals = 3) |> tab_options(table.font.size = 24)
Name Model R2 R2_adjusted RMSE Sigma AIC_wt AICc_wt BIC_wt Performance_Score
fit_09 lm 0.879 0.878 6.055 6.071 0.950 0.952 1.000 1.000
fit_12 lm 0.879 0.878 6.054 6.072 0.050 0.048 0.000 0.586
fit_04 lm 0.871 0.871 6.236 6.241 0.000 0.000 0.000 0.504
fit_01 lm 0.814 0.814 7.492 7.494 0.000 0.000 0.000 0.000

Cross-Validation Performance

Within the training sample, we might try some cross-validation.

set.seed(2024111903)
performance_accuracy(fit_12, method = "cv", k = 5)
# Accuracy of Model Predictions

Accuracy (95% CI): 93.65% [92.64%, 94.15%]
Method: Correlation between observed and predicted
performance_cv(fit_12)
# Cross-validation performance (30% holdout method)

MSE | RMSE |   R2
-----------------
39  |  6.2 | 0.88

This is something we’ll do in 432, too.

Cross-Validation Results

across all four models we’re considering…

Model Accuracy \(MSE_{30}\) \(RMSE_{30}\) \(R^2_{30}\)
fit_12 0.9366 38 6.1 0.88
fit_09 0.9369 36 6 0.88
fit_04 0.9335 38 6.2 0.87
fit_01 0.9022 56 7.5 0.81
  • Accuracy = correlation of waist\(_{observed}\), waist\(_{predicted}\).
  • Other summaries use 30% holdout approach.

Predicting into Test Sample

## Attach each model's test-sample predictions (.fitted) and residuals
## (.resid), tagging rows with the model name so they can be stacked.
aug01_test <- augment(fit_01, newdata = nh_test) |> mutate(mod = "fit_01")
aug04_test <- augment(fit_04, newdata = nh_test) |> mutate(mod = "fit_04")
aug09_test <- augment(fit_09, newdata = nh_test) |> mutate(mod = "fit_09")
aug12_test <- augment(fit_12, newdata = nh_test) |> mutate(mod = "fit_12")

## Stack all four augmented frames in one call (equivalent to chaining
## pairwise bind_rows), then front identifier/model/outcome and sort.
test_res <- 
  bind_rows(aug01_test, aug04_test, aug09_test, aug12_test) |>
  relocate(SEQN, mod, waist, everything()) |> 
  arrange(SEQN, mod)

Four Summary Measures

## Test-sample prediction quality, per model:
##   MAPE    = mean absolute prediction error (in cm)
##   Max_APE = largest absolute prediction error
##   RMSE    = root mean squared prediction error
##   R2_val  = squared correlation of observed and predicted waist
test_summary <- test_res |> 
  group_by(mod) |>
  summarize(MAPE = mean(abs(.resid)),
            Max_APE = max(abs(.resid)),
            RMSE = sqrt(mean(.resid^2)),
            R2_val = cor(waist, .fitted)^2)

test_summary |> gt() |> fmt_number(decimals = 4) |> 
  tab_options(table.font.size = 28)
mod MAPE Max_APE RMSE R2_val
fit_01 5.8849 27.6462 7.3967 0.8191
fit_04 4.8477 26.4983 6.2469 0.8713
fit_09 4.7499 27.7847 6.1422 0.8755
fit_12 4.7504 27.8797 6.1418 0.8756

Multiple Imputation

We’d like to do 25 imputations

25 imputations on model fit_04 takes quite a while (7-10 minutes on my machine.)

pct_miss_case(nh_L)
[1] 23.14341
## Create 25 imputed data sets, fit the fit_04 formula within each one,
## then pool the 25 sets of coefficient estimates.
imp25_ests <- nh_L |>
  mice(m = 25, seed = 2024, print = FALSE) |>
  with(lm(waist ~ wt_kg + age + sex + educ)) |>
  pool()

Pooled Results for model fit_04

glance(imp25_ests)
  nimp nobs r.squared adj.r.squared
1   25 5669 0.8722111     0.8720756
tidy(imp25_ests, conf.int = TRUE, conf.level = 0.90) |>
  select(term, estimate, std.error, conf.low, conf.high, p.value) |>
  gt() |> fmt_number(decimals = 3, columns = c(-p.value)) 
term estimate std.error conf.low conf.high p.value
(Intercept) 30.488 0.537 29.604 31.372 0.000000e+00
wt_kg 0.720 0.004 0.713 0.727 0.000000e+00
age 0.211 0.005 0.203 0.220 1.653545e-283
sexF 3.695 0.174 3.408 3.982 9.271242e-95
educHSGrad −1.584 0.305 −2.085 −1.082 2.122695e-07
educSomeCollege −2.634 0.287 −3.106 −2.162 6.783848e-20
educCollegeGrad −3.934 0.279 −4.394 −3.474 5.797537e-44

Training Sample Results for fit_04

glance(fit_04) |> select(nobs, r.squared, adj.r.squared)
# A tibble: 1 × 3
   nobs r.squared adj.r.squared
  <int>     <dbl>         <dbl>
1  4000     0.871         0.871
tidy(fit_04, conf.int = TRUE, conf.level = 0.90) |>
  select(term, estimate, std.error, conf.low, conf.high, p.value) |>
  gt() |> fmt_number(decimals = 3, columns = c(-p.value)) 
term estimate std.error conf.low conf.high p.value
(Intercept) 30.159 0.609 29.158 31.161 0.000000e+00
wt_kg 0.717 0.005 0.710 0.725 0.000000e+00
age 0.216 0.006 0.206 0.226 1.486143e-239
sexF 3.712 0.205 3.375 4.049 1.432568e-70
educHSGrad −1.321 0.357 −1.908 −0.733 2.210386e-04
educSomeCollege −2.444 0.337 −2.998 −1.890 4.743961e-13
educCollegeGrad −3.746 0.328 −4.286 −3.205 1.166192e-29

Single Imputation Results for fit_04

For the entire sample after single imputation. Our model fit_04 uses wt_kg, age, sex and educ to predict waist. Our identifying code is in SEQN.

## Refit the 4-predictor model on the full singly-imputed sample
## (all 5669 subjects, not just the 4000 in the training partition).
fit4_si <- lm(waist ~ wt_kg + age + sex + educ, data = nh_si)

n_obs(fit4_si)
[1] 5669

Single Imputation Results for fit_04

glance(fit4_si) |> select(nobs, r.squared, adj.r.squared)
# A tibble: 1 × 3
   nobs r.squared adj.r.squared
  <int>     <dbl>         <dbl>
1  5669     0.871         0.871
tidy(fit4_si, conf.int = TRUE, conf.level = 0.90) |>
  select(term, estimate, std.error, conf.low, conf.high, p.value) |>
  gt() |> fmt_number(decimals = 3, columns = c(-p.value)) 
term estimate std.error conf.low conf.high p.value
(Intercept) 30.599 0.513 29.755 31.443 0.000000e+00
wt_kg 0.718 0.004 0.712 0.724 0.000000e+00
age 0.213 0.005 0.204 0.221 0.000000e+00
sexF 3.716 0.172 3.433 4.000 3.985320e-99
educHSGrad −1.652 0.299 −2.144 −1.159 3.617931e-08
educSomeCollege −2.745 0.282 −3.209 −2.280 3.666489e-22
educCollegeGrad −4.007 0.276 −4.460 −3.553 5.009061e-47

Building “Complete Case” Data

Our model fit_04 uses wt_kg, age, sex and educ to predict waist. Our identifying code is in SEQN.

## Complete-case data: keep only the variables fit_04 uses, then drop any
## subject missing at least one of them.
nh_cc <- nh_L |> select(SEQN, waist, wt_kg, age, sex, educ) |>
  drop_na()

fit4_cc <- lm(waist ~ wt_kg + age + sex + educ, data = nh_cc)

n_obs(fit4_cc)  ## complete cases remaining out of 5669
[1] 5397

“Complete Case” Results for fit_04

glance(fit4_cc) |> select(nobs, r.squared, adj.r.squared)
# A tibble: 1 × 3
   nobs r.squared adj.r.squared
  <int>     <dbl>         <dbl>
1  5397     0.870         0.870
tidy(fit4_cc, conf.int = TRUE, conf.level = 0.90) |>
  select(term, estimate, std.error, conf.low, conf.high, p.value) |>
  gt() |> fmt_number(decimals = 3, columns = c(-p.value)) 
term estimate std.error conf.low conf.high p.value
(Intercept) 30.194 0.526 29.329 31.059 0.000000e+00
wt_kg 0.724 0.004 0.717 0.730 0.000000e+00
age 0.211 0.005 0.202 0.219 8.528098e-314
sexF 3.723 0.174 3.436 4.010 4.161630e-97
educHSGrad −1.606 0.305 −2.107 −1.104 1.442240e-07
educSomeCollege −2.669 0.287 −3.141 −2.197 1.964979e-20
educCollegeGrad −3.972 0.280 −4.432 −3.512 5.598585e-45

Our 4-predictor model

  • Use wt_kg, age, sex and educ to predict waist.
Method Sample \(R^2\) wt_kg (90% CI)
Complete Cases 5,397 0.870 0.724 (0.717, 0.730)
Training Sample 4,000 0.871 0.717 (0.710, 0.725)
Single Imp. (all) 5,669 0.871 0.718 (0.712, 0.724)
Multiple Imp. (25) 5,669 0.872 0.720 (0.713, 0.727)

Session Information

xfun::session_info()
R version 4.4.2 (2024-10-31 ucrt)
Platform: x86_64-w64-mingw32/x64
Running under: Windows 11 x64 (build 22631)

Locale:
  LC_COLLATE=English_United States.utf8 
  LC_CTYPE=English_United States.utf8   
  LC_MONETARY=English_United States.utf8
  LC_NUMERIC=C                          
  LC_TIME=English_United States.utf8    

Package version:
  abind_1.4-8          AsioHeaders_1.22.1.2 askpass_1.2.1       
  backports_1.5.0      base64enc_0.1.3      bayestestR_0.15.0   
  bigD_0.3.0           bit_4.5.0            bit64_4.5.2         
  bitops_1.0.9         blob_1.2.4           boot_1.3-31         
  broom_1.0.7          bslib_0.8.0          cachem_1.1.0        
  callr_3.7.6          car_3.1-3            carData_3.0-5       
  cellranger_1.1.0     chromote_0.3.1       cli_3.6.3           
  clipr_0.8.0          coda_0.19-4.1        codetools_0.2-20    
  colorspace_2.1-1     commonmark_1.9.2     compiler_4.4.2      
  conflicted_1.2.0     correlation_0.8.6    cowplot_1.1.3       
  cpp11_0.5.0          crayon_1.5.3         curl_6.0.1          
  data.table_1.16.2    datasets_4.4.2       datawizard_0.13.0   
  DBI_1.2.3            dbplyr_2.5.0         Deriv_4.1.6         
  digest_0.6.37        doBy_4.6.24          dplyr_1.1.4         
  dtplyr_1.3.1         easystats_0.7.3      effectsize_0.8.9    
  emmeans_1.10.5       estimability_1.5.1   evaluate_1.0.1      
  fansi_1.0.6          farver_2.1.2         fastmap_1.2.0       
  fontawesome_0.5.3    forcats_1.0.0        foreach_1.5.2       
  foreign_0.8-87       Formula_1.2-5        fs_1.6.5            
  gargle_1.5.2         generics_0.1.3       GGally_2.2.1        
  ggplot2_3.5.1        ggrepel_0.9.6        ggstats_0.7.0       
  glmnet_4.1-8         glue_1.8.0           goftest_1.2-3       
  googledrive_2.1.1    googlesheets4_1.1.1  graphics_4.4.2      
  grDevices_4.4.2      grid_4.4.2           gridExtra_2.3       
  gt_0.11.1            gtable_0.3.6         haven_2.5.4         
  highr_0.11           hms_1.1.3            htmltools_0.5.8.1   
  htmlwidgets_1.6.4    httpuv_1.6.15        httr_1.4.7          
  ids_1.0.1            insight_0.20.5       isoband_0.2.7       
  iterators_1.0.14     janitor_2.2.0        jomo_2.7-6          
  jquerylib_0.1.4      jsonlite_1.8.9       juicyjuice_0.1.0    
  knitr_1.49           labeling_0.4.3       later_1.3.2         
  lattice_0.22-6       lifecycle_1.0.4      lme4_1.1-35.5       
  lubridate_1.9.3      magrittr_2.0.3       markdown_1.13       
  MASS_7.3-61          Matrix_1.7-1         MatrixModels_0.5.3  
  memoise_2.0.1        methods_4.4.2        mgcv_1.9-1          
  mice_3.16.0          microbenchmark_1.5.0 mime_0.12           
  minqa_1.2.8          mitml_0.4-5          modelbased_0.8.9    
  modelr_0.1.11        multcomp_1.4-26      munsell_0.5.1       
  mvtnorm_1.3-2        naniar_1.1.0         nhanesA_1.1         
  nlme_3.1-166         nloptr_2.1.1         nnet_7.3-19         
  norm_1.0-11.1        nortest_1.0-4        numDeriv_2016.8.1.1 
  olsrr_0.6.1          openssl_2.2.2        ordinal_2023.12.4.1 
  pan_1.9              parallel_4.4.2       parameters_0.23.0   
  patchwork_1.3.0      pbkrtest_0.5.3       performance_0.12.4  
  pillar_1.9.0         pkgconfig_2.0.3      plyr_1.8.9          
  prettyunits_1.2.0    processx_3.8.4       progress_1.2.3      
  promises_1.3.0       ps_1.8.1             purrr_1.0.2         
  quantreg_5.99        R6_2.5.1             ragg_1.3.3          
  rappdirs_0.3.3       RColorBrewer_1.1-3   Rcpp_1.0.13-1       
  RcppEigen_0.3.4.0.2  reactable_0.4.4      reactR_0.6.1        
  readr_2.1.5          readxl_1.4.3         rematch_2.0.0       
  rematch2_2.1.2       report_0.5.9         reprex_2.1.1        
  rlang_1.1.4          rmarkdown_2.29       rpart_4.1.23        
  rstudioapi_0.17.1    rvest_1.0.4          sandwich_3.1-1      
  sass_0.4.9           scales_1.3.0         see_0.9.0           
  selectr_0.4.2        shape_1.4.6.1        shiny_1.9.1         
  snakecase_0.11.1     sourcetools_0.1.7.1  SparseM_1.84.2      
  splines_4.4.2        stats_4.4.2          stringi_1.8.4       
  stringr_1.5.1        survival_3.7-0       sys_3.4.3           
  systemfonts_1.1.0    textshaping_0.4.0    TH.data_1.1-2       
  tibble_3.2.1         tidyr_1.3.1          tidyselect_1.2.1    
  tidyverse_2.0.0      timechange_0.3.0     tinytex_0.54        
  tools_4.4.2          tzdb_0.4.0           ucminf_1.2.2        
  UpSetR_1.4.0         utf8_1.2.4           utils_4.4.2         
  uuid_1.2.1           V8_6.0.0             vctrs_0.6.5         
  viridis_0.6.5        viridisLite_0.4.2    visdat_0.6.0        
  vroom_1.6.5          websocket_1.4.2      withr_3.0.2         
  xfun_0.49            xml2_1.3.6           xplorerr_0.2.0      
  xtable_1.8-4         yaml_2.3.10          zoo_1.8-12